General workflow for data analysis

  1. check data quality

  2. visualize:

    • distributions,

    • collinearity,

    • effects (adf),

    • alternatives?

  3. analyse:

    • model quality (assumption & fit)

    • simplification & selection

Linear Regression

Goal: Estimating a best-fit line describing the relationship between 2 or more variables.

Finding the best-fit line: minimizing the unexplained variation (the residual variation), through minimizing the sums of squared deviations.

Assumptions of regression:

Residual diagnostics:

Univariate Linear Regression

We will use a dataset of caterpillar growth fed different amounts of tannin in diet. First we import the data & see columns & structure.

TANDAT<-read.csv("~/Documents/formations/R - Ecosse/courses/Mod_3_lm/tannin.csv")
str(TANDAT)
## 'data.frame':    9 obs. of  2 variables:
##  $ GROWTH: int  12 10 8 11 6 7 2 3 3
##  $ TANNIN: int  0 1 2 3 4 5 6 7 8
TANDAT
##   GROWTH TANNIN
## 1     12      0
## 2     10      1
## 3      8      2
## 4     11      3
## 5      6      4
## 6      7      5
## 7      2      6
## 8      3      7
## 9      3      8
# inspect histograms
par(mfrow=c(1,2))
hist(TANDAT$GROWTH)
hist(TANDAT$TANNIN)

par(mfrow=c(1,1))

Not pretty, but also not unusual. The sample here is very small – let’s wait until we see diagnostics before we draw conclusions about this distribution.

# plot data of interest: growth as a function of tannin
# lwd=2 doubles the default width of all lines drawn in the plot
# cex=1.5 enlarges the plotted points (cex = character expansion)
plot(GROWTH~TANNIN,data=TANDAT,lwd=2,cex=1.5)

# the plot suggests a solid negative relationship -- use this to anticipate parameters
# could add a line here if you want, but not strictly necessary

# based on this plot I predict an intercept value of around 12, and a slope of around -1.25 (= rise/run = -10/8)

# fit the initial univariate linear model: GROWTH as a function of TANNIN
MOD.1<-lm(GROWTH~TANNIN,data=TANDAT)

# note that the new object has many components
str(MOD.1)
## List of 12
##  $ coefficients : Named num [1:2] 11.76 -1.22
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "TANNIN"
##  $ residuals    : Named num [1:9] 0.244 -0.539 -1.322 2.894 -0.889 ...
##   ..- attr(*, "names")= chr [1:9] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:9] -20.67 -9.42 -1.32 2.83 -1.01 ...
##   ..- attr(*, "names")= chr [1:9] "(Intercept)" "TANNIN" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:9] 11.76 10.54 9.32 8.11 6.89 ...
##   ..- attr(*, "names")= chr [1:9] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:9, 1:2] -3 0.333 0.333 0.333 0.333 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:9] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "TANNIN"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.33 1.26
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 7
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = GROWTH ~ TANNIN, data = TANDAT)
##  $ terms        :Classes 'terms', 'formula' length 3 GROWTH ~ TANNIN
##   .. ..- attr(*, "variables")= language list(GROWTH, TANNIN)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "GROWTH" "TANNIN"
##   .. .. .. ..$ : chr "TANNIN"
##   .. ..- attr(*, "term.labels")= chr "TANNIN"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(GROWTH, TANNIN)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "GROWTH" "TANNIN"
##  $ model        :'data.frame':   9 obs. of  2 variables:
##   ..$ GROWTH: int [1:9] 12 10 8 11 6 7 2 3 3
##   ..$ TANNIN: int [1:9] 0 1 2 3 4 5 6 7 8
##   ..- attr(*, "terms")=Classes 'terms', 'formula' length 3 GROWTH ~ TANNIN
##   .. .. ..- attr(*, "variables")= language list(GROWTH, TANNIN)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "GROWTH" "TANNIN"
##   .. .. .. .. ..$ : chr "TANNIN"
##   .. .. ..- attr(*, "term.labels")= chr "TANNIN"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(GROWTH, TANNIN)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "GROWTH" "TANNIN"
##  - attr(*, "class")= chr "lm"
# before looking at the model, examine diagnostics!!!!

par(mfrow=c(2,3))
plot(MOD.1)
# note the preceding command will call four plots at once
# if you haven't changed the number of panels to plot using the par command, the R console will wait for you to hit enter between plotting each figure
hist(MOD.1$residuals)
par(mfrow=c(1,1))

# everything looks OK -- maybe slight concern at influence of datapts 4 and 7, which I could investigate(see below)

# examine the model 
summary(MOD.1)
## 
## Call:
## lm(formula = GROWTH ~ TANNIN, data = TANDAT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4556 -0.8889 -0.2389  0.9778  2.8944 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.7556     1.0408  11.295 9.54e-06 ***
## TANNIN       -1.2167     0.2186  -5.565 0.000846 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.693 on 7 degrees of freedom
## Multiple R-squared:  0.8157, Adjusted R-squared:  0.7893 
## F-statistic: 30.97 on 1 and 7 DF,  p-value: 0.0008461
# inspect residual quartiles, as well as summary stats



# check to see if datapoint 7 has undue influence: rerun model without it)

# OR perhaps more intuitively
MOD.2<-update(MOD.1,data=TANDAT[-7,])


summary(MOD.2)
## 
## Call:
## lm(formula = GROWTH ~ TANNIN, data = TANDAT[-7, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4549 -0.9572 -0.1622  0.4572  2.6622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.6892     0.8963  13.042 1.25e-05 ***
## TANNIN       -1.1171     0.1956  -5.712  0.00125 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.457 on 6 degrees of freedom
## Multiple R-squared:  0.8446, Adjusted R-squared:  0.8188 
## F-statistic: 32.62 on 1 and 6 DF,  p-value: 0.001247
# both parameters have changed, but in my opinion the change is not dramatic.
# in the absence of further justification for excluding (7), I would retain the first model

# Check whether datapoint 4 (the row with TANNIN == 3) has undue influence by
# refitting without it. NOTE(review): the original code used subset=(TANNIN!=9),
# but TANNIN only takes the values 0-8, so NO row was excluded and MOD.3 was
# byte-identical to MOD.1 (note the unchanged 7 residual df in the printed
# summary) -- the "no dramatic change" conclusion below was vacuous. Re-run this
# chunk to regenerate the output for the genuinely reduced model.
MOD.3<-update(MOD.1,subset=(TANNIN!=3))
summary(MOD.3)
## 
## Call:
## lm(formula = GROWTH ~ TANNIN, data = TANDAT, subset = (TANNIN != 
##     9))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4556 -0.8889 -0.2389  0.9778  2.8944 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.7556     1.0408  11.295 9.54e-06 ***
## TANNIN       -1.2167     0.2186  -5.565 0.000846 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.693 on 7 degrees of freedom
## Multiple R-squared:  0.8157, Adjusted R-squared:  0.7893 
## F-statistic: 30.97 on 1 and 7 DF,  p-value: 0.0008461
# as above, no dramatic change. One can conclude that neither of these points is unduly influencing the results


# examine the first model again 
summary(MOD.1)
## 
## Call:
## lm(formula = GROWTH ~ TANNIN, data = TANDAT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4556 -0.8889 -0.2389  0.9778  2.8944 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.7556     1.0408  11.295 9.54e-06 ***
## TANNIN       -1.2167     0.2186  -5.565 0.000846 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.693 on 7 degrees of freedom
## Multiple R-squared:  0.8157, Adjusted R-squared:  0.7893 
## F-statistic: 30.97 on 1 and 7 DF,  p-value: 0.0008461
# parameters conform well to expectations 

# interpretation on several levels

# (1) IMO, the most important conclusion is that for every increase of 1 unit of dose of tannin, there is
# a corresponding decrease of 1.22 units of growth
 
# (2) In the studied sample of caterpillars, the dietary tannin level explained 81.6% of variation in growth (the mult. R-squared value)
 
# (3) If you care about p-values, the whole model is significant, and both coefficients are significantly nonzero

#  This means that we can reject the null model (in regression, this is usually a model with only an intercept, which explains why the p-value for the slope term is the same as that for the whole model)
# We can also conclude that these data are unlikely if the "true" regression has a slope of zero
# If you were describing these results in a MS, you might produce a table of coefficients, and/or also cite the stats in the text as follows: Increased dietary tannin levels caused significant decreases in caterpillar growth  (regression slope (±SE) = -1.217 (±0.219),N=9,t=-5.565,P=0.0008; see Figure 1).

# Some people prefer to cite F-values, but note that this only works if the only term in the model is the one of interest; otherwise, the F-value won't be the same as that for the coefficient of interest. Also, because the F-value is associated with the whole model, and not the parameter, you can't really point out the parameter in the same parenthetical, which is why I prefer the example above. But for completeness, here's how you might cite the F:
# Increased dietary tannin levels caused significant decreases in caterpillar growth (F(1,7DF)=30.97,P=0.0008; see Figure 1)  

# if you want to see the ANOVA table, use one of the following:
summary.aov(MOD.1)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## TANNIN       1  88.82   88.82   30.97 0.000846 ***
## Residuals    7  20.07    2.87                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(MOD.1)
## Analysis of Variance Table
## 
## Response: GROWTH
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## TANNIN     1 88.817  88.817  30.974 0.0008461 ***
## Residuals  7 20.072   2.867                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# note how much less information rich ANOVA tables are compared to tables of coefficients


# Create a sequence of new x values
NEWXVARS <- seq(0,8,length=100)

# Create a sequence of predicted y values and confidence intervals
NEWYVARS <- predict(MOD.1,
                    newdata=list(TANNIN=NEWXVARS),
                    int="c")

# Check the structure of your predictions
str(NEWYVARS)
##  num [1:100, 1:3] 11.8 11.7 11.6 11.5 11.4 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:100] "1" "2" "3" "4" ...
##   ..$ : chr [1:3] "fit" "lwr" "upr"
##
# Create a new data frame for your predictions:
#  newx: your range of new x values
#  newy: predicted y values for your range of x
#  newupr: the upper 95% confidence intervals
#  newlwr: the lower 95% confidence intervals
##
# Assemble the predictions into a data frame for ggplot:
#  newx: the range of new x values
#  newy: the fitted (predicted) y values
#  newupr / newlwr: the upper and lower 95% confidence bounds
# NB: data_frame() is deprecated (and removed from recent tibble releases);
# base data.frame() -- or tibble::tibble() -- does the same job here.
df.newvars <- data.frame(newx = NEWXVARS,
                         newy = as.numeric(NEWYVARS[,"fit"]),
                         newupr = as.numeric(NEWYVARS[,"upr"]),
                         newlwr = as.numeric(NEWYVARS[,"lwr"]))

ggplot(df.newvars, aes(x = newx, y = newy)) +
  geom_line() +
  geom_ribbon(aes(ymin = newlwr, 
                  ymax = newupr),
              alpha = 0.25) +
  geom_point(data = TANDAT, 
             aes(x = TANNIN,
                 y = GROWTH)) +
  labs(x = "Tannin",
       y = "Growth") +
  theme_bw()

Multivariate Linear Regression

Workflow Summary:

  1. create the most complete model.

  2. check for collinearity with vif() and remove it.

  3. use likelihood ratio tests (e.g. via anova()) to find the minimal adequate model.

  4. get minimal model

## First we read the data and print its characteristics
data_path = "~/Documents/formations/R - Ecosse/courses/Mod_4_mult_regr/WOLFPUPS.csv"
data_to_model = data.table(read.csv(data_path))

print(summary(data_to_model))
##       YEAR          ADULTS          CALVES          WORMS        
##  Min.   :1955   Min.   :234.0   Min.   : 50.0   Min.   :0.01152  
##  1st Qu.:1966   1st Qu.:482.0   1st Qu.:102.0   1st Qu.:0.69826  
##  Median :1977   Median :542.0   Median :111.0   Median :0.85540  
##  Mean   :1977   Mean   :546.1   Mean   :119.5   Mean   :0.81989  
##  3rd Qu.:1988   3rd Qu.:624.0   3rd Qu.:131.0   3rd Qu.:1.00688  
##  Max.   :2016   Max.   :826.0   Max.   :345.0   Max.   :1.21979  
##     PERMITS           PUPS       
##  Min.   :16.00   Min.   : 3.000  
##  1st Qu.:20.00   1st Qu.: 7.000  
##  Median :23.00   Median :10.000  
##  Mean   :24.69   Mean   : 9.956  
##  3rd Qu.:30.00   3rd Qu.:12.000  
##  Max.   :35.00   Max.   :19.000
print(str(data_to_model))
## Classes 'data.table' and 'data.frame':   45 obs. of  6 variables:
##  $ YEAR   : int  1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 ...
##  $ ADULTS : int  559 287 459 599 765 340 377 506 530 517 ...
##  $ CALVES : int  122 67 98 120 158 72 105 106 109 109 ...
##  $ WORMS  : num  0.518 0.95 0.537 1.032 0.991 ...
##  $ PERMITS: int  24 21 33 35 28 25 18 18 17 22 ...
##  $ PUPS   : int  13 6 13 6 10 5 13 7 5 12 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL

Now we plot the histograms for each column that is not a factor to detect abnormal values.

##          [,1]                             [,2]                            
## breaks   Numeric,8                        Numeric,8                       
## counts   Integer,7                        Integer,7                       
## density  Numeric,7                        Numeric,7                       
## mids     Numeric,7                        Numeric,7                       
## xname    "data.frame(data_to_model)[, i]" "data.frame(data_to_model)[, i]"
## equidist TRUE                             TRUE                            
##          [,3]                             [,4]                            
## breaks   Numeric,7                        Numeric,8                       
## counts   Integer,6                        Integer,7                       
## density  Numeric,6                        Numeric,7                       
## mids     Numeric,6                        Numeric,7                       
## xname    "data.frame(data_to_model)[, i]" "data.frame(data_to_model)[, i]"
## equidist TRUE                             TRUE                            
##          [,5]                             [,6]                            
## breaks   Numeric,11                       Numeric,10                      
## counts   Integer,10                       Integer,9                       
## density  Numeric,10                       Numeric,9                       
## mids     Numeric,10                       Numeric,9                       
## xname    "data.frame(data_to_model)[, i]" "data.frame(data_to_model)[, i]"
## equidist TRUE                             TRUE

Sometimes the histogram cannot be used as is, and we apply a transform, e.g. sqrt(), log(), or log10(), to see if it looks better. If a transform helps, it may be a sign that the model should use that transform too.

If we detect data that are not contiguous with the others, they may be outliers. A first observation is that several years of data are missing from the time series. Year 2016 appears to be an outlier, so we remove it.

data_to_model = data_to_model %>% filter(YEAR != 2016)

We use some ggplot for scatterplots. We need to know which is the response, which are the predictors?

# Choose the response and predictor columns by name.
# NOTE(review): the original used quote(data_terms[...]) with aes_string(), but
# data_terms is never defined in this document, so the chunk could not run;
# aes_string() is also deprecated in current ggplot2. Plain string names with
# the .data pronoun are the supported way to map columns programmatically.
RESPONSE <- "PUPS"
PREDICTOR1 <- "CALVES"
PREDICTOR2 <- "WORMS"

ggplot(data = data_to_model,
       aes(x = .data[[PREDICTOR1]], y = .data[[RESPONSE]], color = .data[[PREDICTOR2]])) +
  geom_point() +
  theme_bw()

We examine possible multicollinearity as well as pairwise relationships with pairs().

pairs(data_to_model)

The ggpairs() function also helps by providing scatterplots, densities and correlation values. We use these to detect potential collinearity.

ggpairs(data_to_model)

It looks like CALVES and ADULTS are colinear, their relationship is a straight line.

We build a maximal model but without interactions for the beginning. Then examine diagnostics.

MOD.1<-lm(PUPS~ADULTS+CALVES+WORMS+PERMITS,data=data_to_model)
par(mfrow=c(2,3))
plot(MOD.1)
hist(MOD.1$residuals)
par(mfrow=c(1,1))

## we save these types of plots for later.
# avPlots(MOD.1)
# visreg(MOD.1)

Q-Q plot is nice, Residuals vs Fitted show no particular pattern but could be better (it may have a diamond shape). Histogram of residuals is centered around 0 so seems ok. However we detect on the Residuals vs Leverage one point that has a strong influence (point 19). We can also take a look at the same diagnostics for the different transforms earlier mentioned but in this case the original model is not worse and is simpler so we keep it.

Now we check for variance inflation using the vif() function from the {car} package. If vif() gives a score over 10, it means that there is probably multicolinearity. In case it is over 4-5 it may also be a symptom of multicolinearity. In this case we have to remove (at least) one of the terms.

vif(MOD.1)
##    ADULTS    CALVES     WORMS   PERMITS 
## 15.592589 15.387900  1.266965  1.016862

Here we have 2 values above 10: ADULTS and CALVES. They are thus collinear, as we were expecting. We update the model, getting rid of one of them, and then verify that the multicollinearity has disappeared.

MOD.2a<-update(MOD.1,~. - CALVES)

# reexamine vifs
vif(MOD.2a)
##   ADULTS    WORMS  PERMITS 
## 1.252128 1.266763 1.016858

Note that we did not use the anova() command to compare MOD.1 and MOD.2a. Why not? The order of operations here is extremely important. First, we found a model that was not troubled by collinearity, then we used likelihood ratio tests (with the anova() function) to find the minimal adequate model. Never use anova to decide whether to retain highly collinear variables, as their collinearity is more than enough reason to get rid of them, and is expected to artificially and inappropriately decrease model deviance.

We will update the model like this several times to find the minimal adequate model. Can we remove other terms? We assess the adequacy of each candidate with anova(): we are looking for a simpler model that does not increase the RSS (residual sum of squares) too much. In that case we keep the simpler model; we can also compare the models' information criterion scores with AIC(). The lower, the better.

Warning: when using anova() between 2 models we look at the RSS (residual sum of squares). If the increase in RSS is not too large, we can prefer the simpler model. But anova() should only be used to compare 2 nested models, i.e. models that differ by a single term.

MOD.2b<-update(MOD.1,~. - ADULTS)

MOD.3<-update(MOD.2b,~. - PERMITS)

# reexamine vifs
vif(MOD.3)
##   CALVES    WORMS 
## 1.235484 1.235484
# compare both models
anova(MOD.2b, MOD.3)
## Analysis of Variance Table
## 
## Model 1: PUPS ~ CALVES + WORMS + PERMITS
## Model 2: PUPS ~ CALVES + WORMS
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     40 279.05                           
## 2     41 280.85 -1   -1.8028 0.2584  0.614
AIC(MOD.2b)
## [1] 216.1427
AIC(MOD.3)
## [1] 214.426

We can display the coefficients and confidence intervals for the simplest adequate model:

summary(MOD.3)
## 
## Call:
## lm(formula = PUPS ~ CALVES + WORMS, data = data_to_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8239 -1.7410 -0.1149  1.7320  6.1227 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.26475    2.81364   3.648 0.000739 ***
## CALVES       0.04396    0.01552   2.833 0.007127 ** 
## WORMS       -6.76661    1.83962  -3.678 0.000676 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.617 on 41 degrees of freedom
## Multiple R-squared:  0.4802, Adjusted R-squared:  0.4548 
## F-statistic: 18.93 on 2 and 41 DF,  p-value: 1.498e-06
confint(MOD.3)
##                    2.5 %      97.5 %
## (Intercept)   4.58248000 15.94701170
## CALVES        0.01261874  0.07530036
## WORMS       -10.48179961 -3.05141551
avPlots(MOD.3)

# visreg(MOD.3, xvar="CALVES", by="WORMS", overlay=TRUE)

Z-transformation: by wrapping the terms in scale() when creating the model, we put them on the same scale and can thus compare their relative importance. But be careful to also look at confint() so as not to misinterpret the result.

MOD.4std<-lm(PUPS~scale(CALVES)+scale(WORMS),data=data_to_model)
summary(MOD.4std)
## 
## Call:
## lm(formula = PUPS ~ scale(CALVES) + scale(WORMS), data = data_to_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8239 -1.7410 -0.1149  1.7320  6.1227 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     9.7500     0.3946  24.711  < 2e-16 ***
## scale(CALVES)   1.2567     0.4436   2.833 0.007127 ** 
## scale(WORMS)   -1.6318     0.4436  -3.678 0.000676 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.617 on 41 degrees of freedom
## Multiple R-squared:  0.4802, Adjusted R-squared:  0.4548 
## F-statistic: 18.93 on 2 and 41 DF,  p-value: 1.498e-06
# According to this summary, WORMS has a slightly more important effect than CALVES on PUPS, but the absolute values of confidence intervals for the two coefficients overlap. Can use the confint() command to be sure
confint(MOD.4std)
##                    2.5 %     97.5 %
## (Intercept)    8.9531619 10.5468381
## scale(CALVES)  0.3607337  2.1526214
## scale(WORMS)  -2.5277573 -0.7358696

Now that we have found our model we will use the predict() function to generate predictions and plot them.

# A sequence of new infection-score (WORMS) values spanning the observed range
NEWWORMS<-seq(0.01,1.22,0.01)

# the next line tells me how many rows are in NEWWORMS
length(NEWWORMS)
## [1] 122
# create a vector of the same length, all set to the mean value for CALVES
# (use length(NEWWORMS) rather than the hard-coded 122 so the two stay in sync
# if the sequence above is ever changed)
MEANCALVES<-rep(mean(data_to_model$CALVES),length(NEWWORMS))

# now use predict to generate y-variables
# note that the newdata statement below must include a reference for ALL predictors in the original model
# by setting the CALVES column to be a series of means, we're effectively controlling for the influence of CALVES, and only getting the partial effect of WORMS
NEWYUSINGWORMS<-predict(MOD.3,newdata=list(WORMS=NEWWORMS,CALVES=MEANCALVES),int="c")

# plot the original data, then overlay the fit and its 95% confidence bounds
plot(data_to_model$WORMS,data_to_model$PUPS,xlab="Infection score",ylab="Pups born",ylim=c(0,18))
matlines(NEWWORMS,NEWYUSINGWORMS,lty=c(1,2,2),col="black")

# NB this looks like a somewhat "shallow" fit, but this is because some of the trend is caused by the correlated variable
# The "advantage" of this approach is that the data are readily interpretable in raw units

# like vif(), the avPlots function is from the {car} package, so remember to load it if necessary
# generate avPlots to examine partial effects
avPlots(MOD.3)

# note that the pattern for scaled predictors is the same, but the x-axis units are different

# make publication-quality plots
avPlots(MOD.3,terms="WORMS",xlab="Residual infection score",ylab="Residual pups born",xlim=c(-0.6,0.6),ylim=c(-5,5))

avPlots(MOD.3,terms="CALVES",xlab="Residual calves born",ylab="Residual pups born",xlim=c(-60,60),ylim=c(-4,6))

# to do this by hand, would need resids from two different models
RESWORMSONOTHERPREDICTORS<-lm(WORMS~CALVES,data=data_to_model)
RESPUPSNOWORMS<-lm(PUPS~CALVES,data=data_to_model)
plot(RESWORMSONOTHERPREDICTORS$residuals,RESPUPSNOWORMS$residuals,xlab="Residual infection score",ylab="Residual pups born",xlim=c(-0.6,0.6),ylim=c(-5,5))
abline(lm(RESPUPSNOWORMS$residuals~RESWORMSONOTHERPREDICTORS$residuals))

# this plot should look just like the one generated with avPlots
# you could even add confidence intervals by using a predict command on the linear model nested within line 219 


# INTERPRETATION

# remember that because of strong correlation between CALVES and ADULTS, we can't say that the effect of CALVES in this model is actually due to CALVES or ADULTS. In the discussion, we would need to attend to the implications of both possibilities, even though we're only reporting a parameter estimate for one of them.

# Sample Results text:

# My minimal adequate model of wolf recruitment rates included a significant negative effect for infection status (Slope B = -6.77; Standardized regression coefficient Beta = -1.63; t = -3.68,P= 0.0007; see Figure 1A) and a positive effect for the number of moose calves (B = 0.044, Beta = 1.26; t = 2.83; P = 0.0071; see Figure 1B). Recall that moose calf numbers were collinear with adult population size, so I cannot distinguish between the effects of adults or calves on wolf recruitment.

ANCOVA: factorial and continuous predictors

Interactions in model terms. Does the effect of one predictor depend on another one? To answer this question we use factorial predictors: we group the observations into classes (using the factor() command).

example: we have 2 groups corresponding to 2 observation sites. Once we have the model: The intercept is the mean of group 1 and the slope is the difference between mean of group 1 and mean of group 2. In this case the null hypothesis is: there is no difference between the 2 groups.

AIC. anova() works for nested models; if the models are not nested, we need to use AIC(). Adding parameters can increase the likelihood simply by fitting noise, so AIC penalizes model complexity (the number of coefficients). For AIC, lower is better. logLik() values can also be compared, but unlike AIC the log-likelihood carries no penalty for extra parameters.

Warning: testing for vif() in a model with interactions does not make sense. There is obviously a colinearity in an interaction. Thus we test and remove the colinearity before adding the interaction to the model.

Technically in R we test the interaction with a *, equivalent to a + and a : (the interaction itself).

When simplifying a model with an interaction, start by trying to remove the interaction term itself. This is logical because all the main-effect terms involved in an interaction must remain in the model for as long as the interaction is present.

When the slopes differ between groups, this means there is actually an interaction; if they barely differ, the interaction should probably be removed. Thus, by looking at the slopes and the slope deviations (and their respective standard errors), we should be able to tell whether the lines are parallel (there is in fact no interaction, and we remove it) or not.

data_comp = data.table(read.csv("~/Documents/formations/R - Ecosse/courses/Mod_5_ancova/compensationo.csv"))
print(summary(data_comp))
##       ROOT            FRUIT            GRAZING  
##  Min.   : 4.426   Min.   : 14.73   Grazed  :20  
##  1st Qu.: 6.083   1st Qu.: 41.15   Ungrazed:20  
##  Median : 7.123   Median : 60.88                
##  Mean   : 7.181   Mean   : 59.41                
##  3rd Qu.: 8.510   3rd Qu.: 76.19                
##  Max.   :10.253   Max.   :116.05
ggplot(data=data_comp, aes(x=ROOT, y=FRUIT, color=GRAZING)) + theme_bw() + geom_point() 

ggplot(data=data_comp, aes(x=ROOT, y=FRUIT, color=GRAZING)) + theme_bw() + geom_boxplot() 
## Warning: position_dodge requires constant width: output may be incorrect
## Warning: position_dodge requires non-overlapping x intervals

ggplot(data=data_comp, aes(x=ROOT, y=FRUIT, color=GRAZING)) + theme_bw() + geom_point() + geom_smooth(method="lm", fullrange = TRUE, se = FALSE) + expand_limits(x = 0, y = 0)

lm_plus = lm(FRUIT~ROOT+GRAZING, data=data_comp)
summary(lm_plus)
## 
## Call:
## lm(formula = FRUIT ~ ROOT + GRAZING, data = data_comp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1920  -2.8224   0.3223   3.9144  17.3290 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -127.829      9.664  -13.23 1.35e-15 ***
## ROOT              23.560      1.149   20.51  < 2e-16 ***
## GRAZINGUngrazed   36.103      3.357   10.75 6.11e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.747 on 37 degrees of freedom
## Multiple R-squared:  0.9291, Adjusted R-squared:  0.9252 
## F-statistic: 242.3 on 2 and 37 DF,  p-value: < 2.2e-16
vif(lm_plus)
##     ROOT  GRAZING 
## 2.475973 2.475973
par(mfrow=c(2,2))
plot(lm_plus)

par(mfrow=c(1,1))

lm_interaction = lm(FRUIT~ROOT:GRAZING, data=data_comp)
summary(lm_interaction)
## 
## Call:
## lm(formula = FRUIT ~ ROOT:GRAZING, data = data_comp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4030  -3.0094   0.1571   4.5053  15.5703 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -107.348      8.576  -12.52 7.22e-15 ***
## ROOT:GRAZINGGrazed     21.126      1.035   20.42  < 2e-16 ***
## ROOT:GRAZINGUngrazed   26.099      1.413   18.47  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.044 on 37 degrees of freedom
## Multiple R-squared:  0.9227, Adjusted R-squared:  0.9185 
## F-statistic: 220.8 on 2 and 37 DF,  p-value: < 2.2e-16
# vif(lm_interaction)
par(mfrow=c(2,2))
plot(lm_interaction)

par(mfrow=c(1,1))

# The two lines below fit the SAME model: in R formula syntax ROOT*GRAZING
# expands to ROOT + GRAZING + ROOT:GRAZING (both main effects plus their
# interaction). The second, explicit form simply overwrites the first and is
# kept here to show the equivalence.
lm_all = lm(FRUIT~ROOT*GRAZING, data=data_comp)
lm_all = lm(FRUIT~ROOT+GRAZING+ROOT:GRAZING, data=data_comp)


summary(lm_all)
## 
## Call:
## lm(formula = FRUIT ~ ROOT + GRAZING + ROOT:GRAZING, data = data_comp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.3177  -2.8320   0.1247   3.8511  17.1313 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -125.173     12.811  -9.771 1.15e-11 ***
## ROOT                   23.240      1.531  15.182  < 2e-16 ***
## GRAZINGUngrazed        30.806     16.842   1.829   0.0757 .  
## ROOT:GRAZINGUngrazed    0.756      2.354   0.321   0.7500    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.831 on 36 degrees of freedom
## Multiple R-squared:  0.9293, Adjusted R-squared:  0.9234 
## F-statistic: 157.6 on 3 and 36 DF,  p-value: < 2.2e-16
vif(lm_all)
##         ROOT      GRAZING ROOT:GRAZING 
##     4.289857    60.794195    45.250179
par(mfrow=c(2,2))
plot(lm_all)

par(mfrow=c(1,1))

## vif shows that lm_all has more than 10 for GRAZING and  ROOT:GRAZING 
# we redo the model without GRAZING
lm_simple = lm(FRUIT~ROOT, data=data_comp)
summary(lm_simple)
## 
## Call:
## lm(formula = FRUIT ~ ROOT, data = data_comp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.3844 -10.4447  -0.7574  10.7606  23.7556 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -41.286     10.723  -3.850 0.000439 ***
## ROOT          14.022      1.463   9.584  1.1e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.52 on 38 degrees of freedom
## Multiple R-squared:  0.7073, Adjusted R-squared:  0.6996 
## F-statistic: 91.84 on 1 and 38 DF,  p-value: 1.099e-11
# vif(lm_simple)
par(mfrow=c(2,2))
plot(lm_simple)

par(mfrow=c(1,1))

anova(lm_plus, update(lm_plus, ~. - GRAZING))
## Analysis of Variance Table
## 
## Model 1: FRUIT ~ ROOT + GRAZING
## Model 2: FRUIT ~ ROOT
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     37 1684.5                                  
## 2     38 6948.8 -1   -5264.4 115.63 6.107e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# identical comparison to the one above, written with the explicit model objects
anova(lm_plus, lm_simple)
## Analysis of Variance Table
## 
## Model 1: FRUIT ~ ROOT + GRAZING
## Model 2: FRUIT ~ ROOT
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     37 1684.5                                  
## 2     38 6948.8 -1   -5264.4 115.63 6.107e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# compare log-likelihoods and AICs across the candidate models
logLik(lm_simple)
## 'log Lik.' -159.9065 (df=3)
logLik(lm_plus) ## close second-best logLik, with one fewer parameter than lm_all
## 'log Lik.' -131.564 (df=4)
logLik(lm_interaction)
## 'log Lik.' -133.2841 (df=4)
logLik(lm_all) ## best (highest) logLik, but it spends an extra parameter
## 'log Lik.' -131.5068 (df=5)
AIC(lm_simple)
## [1] 325.8131
AIC(lm_plus) ## best (lowest) AIC
## [1] 271.1279
AIC(lm_interaction)
## [1] 274.5682
AIC(lm_all)
## [1] 273.0135
coef(summary(lm_plus))
##                   Estimate Std. Error   t value     Pr(>|t|)
## 1   (Intercept)     -127.82936   9.664095 -13.22725 1.349804e-15
## 2            ROOT              23.56005   1.148771  20.50892 8.408231e-22
## 3 GRAZINGUngrazed   36.10325   3.357396  10.75335 6.107286e-13
# > summary(lm_all)
# 
# Call:
#   lm(formula = FRUIT ~ ROOT * GRAZING, data = data_comp)
# 
# Residuals:
#   Min       1Q   Median       3Q      Max 
# -17.3177  -2.8320   0.1247   3.8511  17.1313 
# 
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)    
# (Intercept)          -125.173     12.811  -9.771 1.15e-11 ***
#   ROOT                   23.240      1.531  15.182  < 2e-16 ***
#   GRAZINGUngrazed        30.806     16.842   1.829   0.0757 .  
# ROOT:GRAZINGUngrazed    0.756      2.354   0.321   0.7500    
# ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 6.831 on 36 degrees of freedom
# Multiple R-squared:  0.9293,  Adjusted R-squared:  0.9234 
# F-statistic: 157.6 on 3 and 36 DF,  p-value: < 2.2e-16
# 
# > summary(lm_plus)
# 
# Call:
#   lm(formula = FRUIT ~ ROOT + GRAZING, data = data_comp)
# 
# Residuals:
#   Min       1Q   Median       3Q      Max 
# -17.1920  -2.8224   0.3223   3.9144  17.3290 
# 
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)    
# (Intercept)     -127.829      9.664  -13.23 1.35e-15 ***
#   ROOT              23.560      1.149   20.51  < 2e-16 ***
#   GRAZINGUngrazed   36.103      3.357   10.75 6.11e-13 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 6.747 on 37 degrees of freedom
# Multiple R-squared:  0.9291,  Adjusted R-squared:  0.9252 
# F-statistic: 242.3 on 2 and 37 DF,  p-value: < 2.2e-16



### the interaction slope difference is 0.756 with SE 2.354, i.e. indistinguishable from zero
## thus we drop the interaction and keep lm_plus (not lm_simple: GRAZING itself is highly significant)
## common slope is 23.56; intercept difference for ungrazed is +36.103


## Compare model predictions against the observed FRUIT values.
# covariate-only copy of the data for prediction (drop the response by reference)
data_to_predict = copy(data_comp)
data_to_predict[,FRUIT:=NULL]
d1 = data.frame(predicted=predict(lm_plus, data_to_predict))
d2 = data.frame(observed=data_comp$FRUIT)

# row names carry the sample index; make it an explicit column so merge() can align them
d1$sample = as.numeric(rownames(d1))
d2$sample = as.numeric(rownames(d2))

dd = merge(d1, d2)
# long format so predicted/observed bars can be dodged side by side
dd_g = gather(dd, type, value, predicted:observed)

ggplot(data=dd_g, aes(x=sample, y=value, fill=type)) + theme_bw() + geom_bar(stat="identity", position="dodge")

## Paired t-test of predicted vs observed; paired = TRUE keeps each sample aligned
## with its own prediction. A mean difference of ~0 is expected because OLS
## residuals sum to zero by construction.
# BUG FIX: the column is named 'predicted' — the original d1$predict only worked
# through $ partial matching and silently returns NULL on tibbles
t.test(d1$predicted, d2$observed, paired = TRUE)
## 
##  Paired t-test
## 
## data:  d1$predict and d2$observed
## t = 6.4961e-15, df = 39, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.101831  2.101831
## sample estimates:
## mean of the differences 
##            6.750308e-15
# prediction grids: one per grazing level, ROOT swept over the observed range
NEWGRAZED<-expand.grid(GRAZING="Grazed",ROOT=seq(4,11,0.1))
NEWUNGRAZED<-expand.grid(GRAZING="Ungrazed",ROOT=seq(4,11,0.1))
# have a look at what this produces!
head(NEWGRAZED)
##   GRAZING ROOT
## 1  Grazed  4.0
## 2  Grazed  4.1
## 3  Grazed  4.2
## 4  Grazed  4.3
## 5  Grazed  4.4
## 6  Grazed  4.5
# the predicted y values come from predict() with newdata set to the grid;
# expand.grid() already returns a data frame with the right column names
# (GRAZING and ROOT), so it can be passed directly instead of rebuilding a list.
# FIX: interval = "confidence" is spelled out — the original int="c" relied on
# partial argument matching, which is fragile and flagged by R CMD check.
YVGRAZED<-predict(lm_plus, newdata=NEWGRAZED, interval="confidence")
YVUNGRAZED<-predict(lm_plus, newdata=NEWUNGRAZED, interval="confidence")


# Assemble the predictions into one long data frame for ggplot.
# BUG FIX: the grazed rows were labelled "Grazing", which does not match the
# "Grazed" level used in data_comp, so the prediction groups disagreed with
# the plotted points. Also replaced the deprecated data_frame() with data.frame().
df_graze_pred <- data.frame(GRAZING = factor(c(rep("Grazed",
                                                   times = length(NEWGRAZED$GRAZING)),
                                               rep("Ungrazed",
                                                   times = length(NEWUNGRAZED$GRAZING)))),
                            ROOT = c(NEWGRAZED$ROOT, NEWUNGRAZED$ROOT),
                            FRUIT_PRED = as.numeric(c(YVGRAZED[,"fit"], YVUNGRAZED[,"fit"])),
                            FRUIT_LWR = as.numeric(c(YVGRAZED[,"lwr"], YVUNGRAZED[,"lwr"])),
                            FRUIT_UPR = as.numeric(c(YVGRAZED[,"upr"], YVUNGRAZED[,"upr"])))

##
# Create ggplot from predicted data (fitted lines + 95% CI ribbons),
#  then plot the raw points over the top of this
##
ggplot(df_graze_pred, aes(x = ROOT, y = FRUIT_PRED,
                          group = GRAZING)) +
  geom_line() +
  geom_ribbon(aes(ymin = FRUIT_LWR,
                  ymax = FRUIT_UPR),
              alpha = 0.1) +
  geom_point(data = data_comp, aes(x = ROOT, y = FRUIT,
                              colour = GRAZING),
             size = 4, alpha = 0.7) +
  labs(x = "Initial root diameter",
       y = "Seed production") +
  theme_classic()

Generalized Linear Models

Why generalize the linear model? Because the response may be discrete or the errors non-normal.

Generalized models are useful in the case of “difficult” response variables:

Four common error structures:

Overdispersion and underdispersion:

Warning: If the model uses a log, logit, power or other transformation (link function), we need to get the predictions in link space — not response space — to obtain correct SEs. When plotting, we then back-transform to response space.

data_species = data.table(read.csv("~/Documents/formations/R - Ecosse/courses/Mod_6_generalized_mods/SPECIES.csv"))
print(summary(data_species))
##     PH        BIOMASS           RICHNESS    
##  high:30   Min.   :0.05017   Min.   : 2.00  
##  low :30   1st Qu.:1.44132   1st Qu.:12.25  
##  mid :30   Median :3.10836   Median :18.50  
##            Mean   :3.55701   Mean   :19.46  
##            3rd Qu.:5.08570   3rd Qu.:25.75  
##            Max.   :9.98177   Max.   :44.00
print(str(data_species))
## Classes 'data.table' and 'data.frame':   90 obs. of  3 variables:
##  $ PH      : Factor w/ 3 levels "high","low","mid": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BIOMASS : num  0.469 1.731 2.09 3.926 4.367 ...
##  $ RICHNESS: int  30 39 44 35 25 29 23 18 19 12 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL
ggpairs(data_species)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# distribution of the response and the continuous predictor
# NOTE(review): geom_bar(binwidth=) is long-deprecated; in current ggplot2 use
# geom_histogram(binwidth=) instead
ggplot(data=data_species, aes(x=RICHNESS)) + theme_bw() + geom_bar(binwidth=3)

ggplot(data=data_species, aes(x=BIOMASS)) + theme_bw() + geom_bar(binwidth=1)

## RICHNESS is a discrete count, bounded below at 0 — looks Poisson-distributed.
## BIOMASS is continuous, bounded below at 0.

# exploratory plots: richness vs biomass (by PH and overall), richness vs PH
p1 = ggplot(data_species, aes(x=BIOMASS, y=RICHNESS, color=PH)) + theme_bw() + geom_point() + geom_smooth(method="lm")
p2 = ggplot(data_species) + theme_bw() + geom_point(aes(x=BIOMASS, y=RICHNESS))
p3 = ggplot(data_species) + theme_bw() + geom_point(aes(x=PH, y=RICHNESS))

grid.arrange(p1, p2, p3, ncol=3)

# same relationship on the log scale: the slopes differ by PH, suggesting an interaction
p_log = ggplot(data_species, aes(x=BIOMASS, y=log(RICHNESS), color=PH)) + theme_bw() + geom_point() + geom_smooth(method="lm", se=FALSE)
print(p_log)

## if we use log space we can see that there is an interaction.


# Poisson GLM (log link): richness modelled as BIOMASS x PH with interaction
# NOTE(review): this reuses the name MOD.1 from the earlier lm() section,
# shadowing that model — a distinct name would be safer
MOD.1<-glm(RICHNESS~BIOMASS*PH,data=data_species,family=poisson)
summary(MOD.1)
## 
## Call:
## glm(formula = RICHNESS ~ BIOMASS * PH, family = poisson, data = data_species)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4978  -0.7485  -0.0402   0.5575   3.2297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.76812    0.06153  61.240  < 2e-16 ***
## BIOMASS       -0.10713    0.01249  -8.577  < 2e-16 ***
## PHlow         -0.81557    0.10284  -7.931 2.18e-15 ***
## PHmid         -0.33146    0.09217  -3.596 0.000323 ***
## BIOMASS:PHlow -0.15503    0.04003  -3.873 0.000108 ***
## BIOMASS:PHmid -0.03189    0.02308  -1.382 0.166954    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 452.346  on 89  degrees of freedom
## Residual deviance:  83.201  on 84  degrees of freedom
## AIC: 514.39
## 
## Number of Fisher Scoring iterations: 4
## Residual deviance:  83.201  on 84  degrees of freedom so we are good.

### try other stuff to see ho it is going
summary(glm(RICHNESS~BIOMASS+PH,data=data_species,family=poisson))  ## might be ok
## 
## Call:
## glm(formula = RICHNESS ~ BIOMASS + PH, family = poisson, data = data_species)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5959  -0.6989  -0.0737   0.6647   3.5604  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.84894    0.05281  72.885  < 2e-16 ***
## BIOMASS     -0.12756    0.01014 -12.579  < 2e-16 ***
## PHlow       -1.13639    0.06720 -16.910  < 2e-16 ***
## PHmid       -0.44516    0.05486  -8.114 4.88e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 452.346  on 89  degrees of freedom
## Residual deviance:  99.242  on 86  degrees of freedom
## AIC: 526.43
## 
## Number of Fisher Scoring iterations: 4
summary(glm(RICHNESS~BIOMASS*PH,data=data_species,family=gaussian)) ## overdispersion
## 
## Call:
## glm(formula = RICHNESS ~ BIOMASS * PH, family = gaussian, data = data_species)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -9.290  -2.554  -0.124   2.208  15.677  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    40.60407    1.36701  29.703  < 2e-16 ***
## BIOMASS        -2.80045    0.23856 -11.739  < 2e-16 ***
## PHlow         -22.75667    1.83564 -12.397  < 2e-16 ***
## PHmid         -11.57307    1.86926  -6.191  2.1e-08 ***
## BIOMASS:PHlow  -0.02733    0.51248  -0.053    0.958    
## BIOMASS:PHmid   0.23535    0.38579   0.610    0.543    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 14.57959)
## 
##     Null deviance: 8338.3  on 89  degrees of freedom
## Residual deviance: 1224.7  on 84  degrees of freedom
## AIC: 504.37
## 
## Number of Fisher Scoring iterations: 2
summary(glm(RICHNESS~BIOMASS+PH,data=data_species,family=gaussian)) ## overdispersion
## 
## Call:
## glm(formula = RICHNESS ~ BIOMASS + PH, family = gaussian, data = data_species)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -8.965  -2.693   0.062   2.171  15.508  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.2449     1.0923   36.84   <2e-16 ***
## BIOMASS      -2.7276     0.1717  -15.89   <2e-16 ***
## PHlow       -22.6200     1.0818  -20.91   <2e-16 ***
## PHmid       -10.6418     1.0063  -10.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 14.31331)
## 
##     Null deviance: 8338.3  on 89  degrees of freedom
## Residual deviance: 1230.9  on 86  degrees of freedom
## AIC: 500.82
## 
## Number of Fisher Scoring iterations: 2
# base-R residual diagnostics for the GLM
par(mfrow=c(2,2))
plot(MOD.1)

## good enough but we prefer the next one (boot::glm.diag.plots):
glm.diag.plots(MOD.1)

# can interact with the plots to identify points (for example those with high influence or leverage) by specifying iden=TRUE inside call for glm.diag.plots
# in this case, record 18 may have high influence
# otherwise this looks pretty good

## try a model simplification: drop the interaction term
MOD.2 = update(MOD.1, ~. - BIOMASS:PH)
summary(MOD.2)
## 
## Call:
## glm(formula = RICHNESS ~ BIOMASS + PH, family = poisson, data = data_species)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5959  -0.6989  -0.0737   0.6647   3.5604  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.84894    0.05281  72.885  < 2e-16 ***
## BIOMASS     -0.12756    0.01014 -12.579  < 2e-16 ***
## PHlow       -1.13639    0.06720 -16.910  < 2e-16 ***
## PHmid       -0.44516    0.05486  -8.114 4.88e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 452.346  on 89  degrees of freedom
## Residual deviance:  99.242  on 86  degrees of freedom
## AIC: 526.43
## 
## Number of Fisher Scoring iterations: 4
## compare the nested GLMs with a chi-squared likelihood-ratio test
# when simplifying a glm you must request test="Chi"; unlike anova.lm, the
# default glm deviance table reports no p-value
anova(MOD.1, MOD.2, test="Chi")
## Analysis of Deviance Table
## 
## Model 1: RICHNESS ~ BIOMASS * PH
## Model 2: RICHNESS ~ BIOMASS + PH
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        84     83.201                          
## 2        86     99.242 -2   -16.04 0.0003288 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## p-value is 0.0003288 so we have to keep the first model with the interaction


# could check whether PHmid and high are really different by lumping them and doing a Chi-squared likelihood ratio test
NEWPH<-factor(data_species$PH)
# generates a new object to manipulate, so that we don't accidentally mess up the original data
# have a look at it
levels(NEWPH)
## [1] "high" "low"  "mid"
# BUG FIX: the levels are ordered "high","low","mid", so the original
# levels(NEWPH)[c(1,2)] <- "mid&high" lumped "high" with "LOW" (and the
# follow-up assignment to index 3 then appended a spurious unused "low"
# level). To test mid vs high, lump levels 1 and 3:
levels(NEWPH)[c(1,3)]<-"mid&high"
levels(NEWPH)
## should now be: "mid&high" "low"
# refit with the 2-level factor and compare against the 3-level model
# NOTE(review): the deviance table printed below was produced by the buggy
# lumping and should be regenerated after this fix
MOD.1.2LEVS<-glm(RICHNESS~BIOMASS*NEWPH,data=data_species,family=poisson)
anova(MOD.1,MOD.1.2LEVS,test="Chi")
## Analysis of Deviance Table
## 
## Model 1: RICHNESS ~ BIOMASS * PH
## Model 2: RICHNESS ~ BIOMASS * NEWPH
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        84      83.20                          
## 2        86     385.86 -2  -302.66 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# so must retain 3 levels

data_to_predict = copy(data_species)
data_to_predict[,RICHNESS:=NULL]

## glm so we use link and not response to get correct SE.
data_to_predict$PRED_RICHNESS = predict(MOD.1, data_to_predict, type="link", se.fit = TRUE)$fit
data_to_predict$PRED_SE = predict(MOD.1, data_to_predict, type="link", se.fit = TRUE)$se.fit

full_data = merge(data_species, data_to_predict, by=c("PH", "BIOMASS"))

## Get a 'tidy' version of the model coefficients
tidy(MOD.1)
##            term    estimate  std.error statistic      p.value
## 1   (Intercept)  3.76812359 0.06153074 61.239689 0.000000e+00
## 2       BIOMASS -0.10712979 0.01249004 -8.577218 9.719538e-18
## 3         PHlow -0.81557124 0.10283673 -7.930739 2.178459e-15
## 4         PHmid -0.33146257 0.09216810 -3.596283 3.227965e-04
## 5 BIOMASS:PHlow -0.15502818 0.04003169 -3.872636 1.076643e-04
## 6 BIOMASS:PHmid -0.03189202 0.02307575 -1.382058 1.669540e-01
## Get a 'tidy' version of the model summary statistics
glance(MOD.1)
##   null.deviance df.null    logLik      AIC      BIC deviance df.residual
## 1      452.3459      89 -251.1957 514.3913 529.3902 83.20114          84
## 'Augment' adds columns to the original data set
augment(MOD.1)
##    RICHNESS    BIOMASS   PH  .fitted    .se.fit       .resid       .hat
## 1        30 0.46929722 high 3.717848 0.05682696 -1.830938949 0.13296877
## 2        39 1.73087043 high 3.582696 0.04553397  0.498291437 0.07457890
## 3        44 2.08977848 high 3.544246 0.04283936  1.530422254 0.06352321
## 4        35 3.92578714 high 3.347555 0.03529456  1.188181582 0.03541932
## 5        25 4.36679265 high 3.300310 0.03550719 -0.412773078 0.03419316
## 6        29 5.48197468 high 3.180841 0.03961766  0.973834076 0.03777456
## 7        23 6.68468591 high 3.051995 0.04834758  0.394953850 0.04945543
## 8        18 7.51165063 high 2.963402 0.05592268 -0.313658951 0.06055714
## 9        19 8.13220251 high 2.896922 0.06213038  0.205492338 0.06993999
## 10       12 9.57212864 high 2.742663 0.07761122 -0.932960228 0.09353471
## 11       39 0.08665367 high 3.758840 0.06064703 -0.604616337 0.15778389
## 12       35 1.23697390 high 3.635607 0.04966633 -0.481254584 0.09355092
## 13       30 2.53204324 high 3.496866 0.03996309 -0.532490479 0.05272153
## 14       30 3.40794153 high 3.403032 0.03613154 -0.010049163 0.03923656
## 15       33 4.60504596 high 3.274786 0.03597417  1.228314946 0.03421394
## 16       20 5.36771709 high 3.193081 0.03898821 -0.912580761 0.03703430
## 17       26 6.56084215 high 3.065262 0.04730297  0.952655979 0.04797370
## 18       36 7.24206214 high 2.992283 0.05335179  3.229745189 0.05673227
## 19       18 8.50363299 high 2.857131 0.06600228  0.140250778 0.07584979
## 20        7 9.39095342 high 2.762073 0.07560245 -2.497794134 0.09049506
## 21       39 0.76488801 high 3.686181 0.05398068 -0.141794254 0.11624249
## 22       39 1.17647020 high 3.642089 0.05020129  0.133620993 0.09619860
## 23       34 2.32512082 high 3.519034 0.04124174  0.042669144 0.05740780
## 24       31 3.22288207 high 3.422857 0.03670348  0.061856062 0.04129926
## 25       24 4.13612930 high 3.325021 0.03528906 -0.738065195 0.03461932
## 26       25 5.13717652 high 3.217779 0.03785030  0.005483321 0.03577688
## 27       20 6.42193811 high 3.080143 0.04616488 -0.382881324 0.04637806
## 28       21 7.06552638 high 3.011195 0.05171784  0.151880746 0.05432834
## 29       12 8.74592918 high 2.831174 0.06857939 -1.272916762 0.07979041
## 30       11 9.98177013 high 2.698779 0.08220179 -1.050573561 0.10042167
## 31       29 0.17576270  mid 3.412226 0.06591165 -0.243782821 0.13177572
## 32       30 1.37677830  mid 3.245259 0.04975589  0.832477691 0.06354581
## 33       21 2.55104256  mid 3.082010 0.04121504 -0.172871886 0.03703493
## 34       18 3.00027434  mid 3.019557 0.04093925 -0.560147929 0.03432866
## 35       13 4.90562386  mid 2.754672 0.05717488 -0.706387722 0.05137473
## 36       13 5.34330542  mid 2.693825 0.06341117 -0.474866785 0.05946273
## 37        9 7.70000000  mid 2.366193 0.10271090 -0.521592047 0.11242367
## 38       24 0.55368893  mid 3.359686 0.06032583 -0.917585210 0.10473691
## 39       26 1.99029644  mid 3.159966 0.04404750  0.492315991 0.04572972
## 40       26 2.91263671  mid 3.031741 0.04084741  1.112242605 0.03459374
## 41       20 3.21645133  mid 2.989504 0.04146414  0.027824133 0.03417201
## 42       21 4.97988468  mid 2.744348 0.05819277  1.310060896 0.05267366
## 43       15 5.65872290  mid 2.649975 0.06820811  0.222767544 0.06584786
## 44        8 8.10000000  mid 2.310584 0.10987612 -0.679957356 0.12169724
## 45       31 0.73956986  mid 3.333845 0.05772211  0.548417154 0.09344472
## 46       28 1.52693420  mid 3.224384 0.04814977  0.560462271 0.05828014
## 47       18 2.23212239  mid 3.126347 0.04250878 -1.042132992 0.04118249
## 48       16 3.88528818  mid 2.896521 0.04556263 -0.506184647 0.03759766
## 49       19 4.62650541  mid 2.793476 0.05352309  0.641884730 0.04680298
## 50       20 5.12096844  mid 2.724735 0.06017423  1.159584577 0.05522789
## 51        6 8.30000000  mid 2.282780 0.11348814 -1.309789570 0.12626987
## 52       25 0.51127858  mid 3.365582 0.06093421 -0.751914501 0.10749200
## 53       23 1.47823269  mid 3.231154 0.04865739 -0.466202800 0.05991978
## 54       25 2.93455800  mid 3.028693 0.04086377  0.921702300 0.03451610
## 55       22 3.50548891  mid 2.949322 0.04280285  0.649392558 0.03497998
## 56       15 4.61790914  mid 2.794671 0.05341537 -0.340394747 0.04667053
## 57       11 5.69696382  mid 2.644659 0.06880396 -0.853512642 0.06664808
## 58       17 6.09301688  mid 2.589599 0.07512825  0.965268833 0.07520645
## 59       24 0.73006280  mid 3.335166 0.05785267 -0.790384806 0.09399206
## 60       27 1.15806838  mid 3.275664 0.05229781  0.104468680 0.07237189
## 61       18 0.10084790  low 2.926114 0.07952701 -0.152551401 0.11798398
## 62       19 0.13859609  low 2.916218 0.07847375  0.122435289 0.11374829
## 63       15 0.86351508  low 2.726175 0.06133523 -0.070409731 0.05746235
## 64       19 1.29291903  low 2.613603 0.05531438  1.366761454 0.04175914
## 65       12 2.46916355  low 2.305241 0.06219165  0.604286774 0.03878137
## 66       11 2.36655309  low 2.332142 0.06031538  0.215716251 0.03747118
## 67       15 2.62921708  low 2.263282 0.06547631  1.603817142 0.04121973
## 68        9 3.25228652  low 2.099940 0.08139677  0.287198669 0.05410209
## 69        3 4.41727619  low 1.794528 0.11836680 -1.363002368 0.08429941
## 70        2 4.78081039  low 1.699225 0.13083984 -1.707366692 0.09363885
## 71       18 0.05017529  low 2.939399 0.08095938 -0.209716455 0.12390739
## 72       19 0.48283691  low 2.825973 0.06950344  0.506384588 0.08152983
## 73       13 0.65266714  low 2.781450 0.06559076 -0.809819223 0.06944708
## 74        9 1.55533656  low 2.544808 0.05378741 -1.106826675 0.03686041
## 75        8 1.67163820  low 2.514319 0.05369325 -1.326024659 0.03562848
## 76       14 2.87005390  low 2.200145 0.07111828  1.530421843 0.04565410
## 77       13 2.51072052  low 2.294347 0.06300443  0.933582567 0.03937041
## 78        4 3.49760385  low 2.035628 0.08862409 -1.455832000 0.06014143
## 79        8 3.67876186  low 1.988136 0.09419877  0.254381201 0.06479413
## 80        2 4.83154245  low 1.685925 0.13260183 -1.680408098 0.09490721
## 81       17 0.28972266  low 2.876599 0.07438525 -0.180187597 0.09823444
## 82       14 0.07756009  low 2.932219 0.08018272 -1.153231604 0.12067202
## 83       15 1.42902041  low 2.577923 0.05429925  0.493282206 0.03883004
## 84       17 1.12074092  low 2.658741 0.05724708  0.699043227 0.04679342
## 85        9 1.50795384  low 2.557230 0.05392984 -1.148901072 0.03751904
## 86        8 2.32596318  low 2.342783 0.05962741 -0.779018752 0.03701301
## 87       12 2.99570582  low 2.167204 0.07434044  1.045237944 0.04826830
## 88       14 3.53819909  low 2.024985 0.08985744  2.084710096 0.06117254
## 89        7 4.36454121  low 1.808353 0.11658280  0.355785743 0.08291586
## 90        3 4.87050789  low 1.675710 0.13395828 -1.105706760 0.09587449
##       .sigma      .cooksd   .std.resid
## 1  0.9776705 8.942034e-02 -1.966330396
## 2  0.9995953 3.703543e-03  0.517980877
## 3  0.9860477 3.072845e-02  1.581476575
## 4  0.9923656 9.622873e-03  1.209799880
## 5  1.0001489 1.013447e-03 -0.420016377
## 6  0.9952632 6.875409e-03  0.992765213
## 7  1.0002231 1.463750e-03  0.405098009
## 8  1.0005807 1.098361e-03 -0.323610423
## 9  1.0009377 5.781984e-04  0.213078754
## 10 0.9954167 1.520982e-02 -0.979913049
## 11 0.9985960 1.313559e-02 -0.658822021
## 12 0.9996724 4.280517e-03 -0.505479066
## 13 0.9994083 2.690766e-03 -0.547107935
## 14 1.0012103 7.149918e-07 -0.010252308
## 15 0.9917669 9.958598e-03  1.249882736
## 16 0.9959938 5.201635e-03 -0.929963492
## 17 0.9954587 8.555446e-03  0.976363702
## 18 0.9323019 1.376726e-01  3.325452675
## 19 1.0010829 2.944200e-04  0.145892846
## 20 0.9590494 8.984423e-02 -2.619112238
## 21 1.0010741 4.949969e-04 -0.150831508
## 22 1.0010921 3.529723e-04  0.140552376
## 23 1.0011993 1.965449e-05  0.043949304
## 24 1.0011869 2.876098e-05  0.063174344
## 25 0.9978100 3.215164e-03 -0.751182433
## 26 1.0012108 1.929044e-07  0.005584122
## 27 1.0002856 1.211962e-03 -0.392081241
## 28 1.0010642 2.361845e-04  0.156182559
## 29 0.9905598 2.282262e-02 -1.326956322
## 30 0.9938014 2.075207e-02 -1.107661281
## 31 1.0007990 1.705968e-03 -0.261629756
## 32 0.9967483 8.828172e-03  0.860259219
## 33 1.0010242 1.964694e-04 -0.176164786
## 34 0.9992541 1.845662e-03 -0.570017335
## 35 0.9980411 4.465751e-03 -0.725263460
## 36 0.9997674 2.422300e-03 -0.489647779
## 37 0.9993650 6.126075e-03 -0.553640821
## 38 0.9955363 1.729173e-02 -0.969775179
## 39 0.9996816 2.097157e-03  0.503974128
## 40 0.9934710 8.276280e-03  1.131994904
## 41 1.0012061 4.736588e-06  0.028312078
## 42 0.9902504 1.864949e-02  1.345989517
## 43 1.0008913 6.364257e-04  0.230485235
## 44 0.9980387 1.128820e-02 -0.725536981
## 45 0.9992128 5.896260e-03  0.575988597
## 46 0.9992020 3.568686e-03  0.577544585
## 47 0.9943724 7.518141e-03 -1.064278203
## 48 0.9996078 1.664725e-03 -0.515977345
## 49 0.9986068 3.724575e-03  0.657454505
## 50 0.9926107 1.523947e-02  1.192995754
## 51 0.9893266 4.068673e-02 -1.401241118
## 52 0.9973922 1.212320e-02 -0.795907141
## 53 0.9998189 2.380186e-03 -0.480830956
## 54 0.9959027 5.597182e-03  0.938033077
## 55 0.9985782 2.770846e-03  0.661057359
## 56 1.0004794 9.638560e-04 -0.348627257
## 57 0.9965038 8.584260e-03 -0.883460720
## 58 0.9951305 1.485992e-02  1.003750762
## 59 0.9970536 1.132927e-02 -0.830371778
## 60 1.0011402 1.540186e-04  0.108467382
## 61 1.0010522 5.813100e-04 -0.162434384
## 62 1.0011092 3.652559e-04  0.130055303
## 63 1.0011793 5.312320e-05 -0.072524264
## 64 0.9894120 1.590694e-02  1.396224899
## 65 0.9989226 2.717093e-03  0.616356525
## 66 1.0009200 3.207064e-04  0.219875072
## 67 0.9849367 2.254352e-02  1.637929883
## 68 1.0006862 8.591152e-04  0.295297854
## 69 0.9889287 2.534302e-02 -1.424360313
## 70 0.9816686 4.181418e-02 -1.793395864
## 71 1.0009089 1.164318e-03 -0.224056475
## 72 0.9995297 4.300167e-03  0.528381923
## 73 0.9969616 8.176734e-03 -0.839493896
## 74 0.9935284 7.273783e-03 -1.127807557
## 75 0.9901797 9.813296e-03 -1.350297339
## 76 0.9863338 2.289611e-02  1.566600459
## 77 0.9957370 6.810231e-03  0.952521424
## 78 0.9875494 1.981979e-02 -1.501689028
## 79 1.0007946 8.240611e-04  0.263045818
## 80 0.9822599 4.129312e-02 -1.766315225
## 81 1.0009943 6.443764e-04 -0.189748378
## 82 0.9920691 3.152138e-02 -1.229818693
## 83 0.9996866 1.781782e-03  0.503147540
## 84 0.9981217 4.453096e-03  0.715995869
## 85 0.9929251 7.959080e-03 -1.171079994
## 86 0.9974120 3.711912e-03 -0.793848633
## 87 0.9942801 1.084810e-02  1.071415435
## 88 0.9729594 6.300933e-02  2.151556608
## 89 1.0003801 2.179798e-03  0.371521498
## 90 0.9930415 2.007863e-02 -1.162855101
# fitted curves and 95% CI ribbons, back-transformed from the link scale with exp();
# the +/- 1.96*SE interval is built in link space, then exponentiated
p_final = ggplot(full_data) + theme_bw() + geom_line(aes(x=BIOMASS, y=exp(PRED_RICHNESS), color=PH)) + geom_point(aes(x=BIOMASS, y=RICHNESS, color=PH)) + ylab("RICHNESS") + geom_ribbon(aes(x=BIOMASS, ymin = exp(PRED_RICHNESS-1.96*PRED_SE), ymax = exp(PRED_RICHNESS+1.96*PRED_SE), group=PH, fill=PH), alpha=.3)

print(p_final)

# visreg one-liner that produces an equivalent response-scale plot
visreg(MOD.1, xvar = "BIOMASS", scale = "response", rug=FALSE, by="PH", overlay=TRUE)

Useful Tools

Augment

comp = data.table(read.csv("~/Documents/formations/R - Ecosse/courses/Mod_7_predict/compensationo.csv"))
print(summary(comp))
##       ROOT            FRUIT            GRAZING  
##  Min.   : 4.426   Min.   : 14.73   Grazed  :20  
##  1st Qu.: 6.083   1st Qu.: 41.15   Ungrazed:20  
##  Median : 7.123   Median : 60.88                
##  Mean   : 7.181   Mean   : 59.41                
##  3rd Qu.: 8.510   3rd Qu.: 76.19                
##  Max.   :10.253   Max.   :116.05
print(str(comp))
## Classes 'data.table' and 'data.frame':   40 obs. of  3 variables:
##  $ ROOT   : num  6.22 6.49 4.92 5.13 5.42 ...
##  $ FRUIT  : num  59.8 61 14.7 19.3 34.2 ...
##  $ GRAZING: Factor w/ 2 levels "Grazed","Ungrazed": 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL
ggpairs(comp)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# quick distribution checks of the two numeric variables
hist(comp$ROOT)

hist(comp$FRUIT)

# hist(comp$GRAZING)  # GRAZING is a factor, so hist() does not apply

## the interaction was not significant in the earlier analysis, so rebuild the additive model
comp_mod = lm(FRUIT~ROOT+GRAZING, data=comp)
tidy(comp_mod)
##              term   estimate std.error statistic      p.value
## 1     (Intercept) -127.82936  9.664095 -13.22725 1.349804e-15
## 2            ROOT   23.56005  1.148771  20.50892 8.408231e-22
## 3 GRAZINGUngrazed   36.10325  3.357396  10.75335 6.107286e-13
### standard ggplot of fit + CI, built from broom::augment() output
# augment() adds .fitted and .se.fit columns to the original data,
# which drive the fitted lines and the ~95% CI ribbons below
comp_mod %>%
  augment() %>%
  ggplot(data = .,
         aes(x = ROOT,
             y = FRUIT,
             colour = GRAZING,
             fill = GRAZING)) +
  geom_point() +                  # points from original data frame
  geom_line(aes(y = .fitted)) +   # line from model fit
  geom_ribbon(aes(ymin = .fitted - (1.96 * .se.fit),
                  ymax = .fitted + (1.96 * .se.fit)),   # CIs from model fit
              alpha = 0.3,
              linetype = 0) +
  theme_bw()

## OR the same thing via visreg:
visreg(comp_mod) # by default each panel holds the other predictors at their median/reference; change with cond=

median(comp$ROOT)
## [1] 7.1235
# effect of GRAZING with ROOT held at 10 instead of its median
visreg(comp_mod, xvar = "GRAZING", cond=list(ROOT=10))

# effect of ROOT shown separately per GRAZING level
visreg(comp_mod, xvar = "ROOT", by="GRAZING") 

visreg

isle = data.table(read.csv("~/Documents/formations/R - Ecosse/courses/Mod_7_predict/isolation.csv"))
print(summary(isle))
##    INCIDENCE         AREA         ISOLATION    
##  Min.   :0.00   Min.   :0.153   Min.   :2.023  
##  1st Qu.:0.00   1st Qu.:2.248   1st Qu.:4.823  
##  Median :1.00   Median :4.170   Median :5.801  
##  Mean   :0.58   Mean   :4.319   Mean   :5.856  
##  3rd Qu.:1.00   3rd Qu.:6.431   3rd Qu.:7.191  
##  Max.   :1.00   Max.   :9.269   Max.   :9.577
print(str(isle))
## Classes 'data.table' and 'data.frame':   50 obs. of  3 variables:
##  $ INCIDENCE: int  1 0 1 0 0 1 1 0 1 1 ...
##  $ AREA     : num  7.93 1.93 2.04 4.78 1.54 ...
##  $ ISOLATION: num  3.32 7.55 5.88 5.93 5.31 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL
ggpairs(isle)

hist(isle$ISOLATION)

hist(isle$INCIDENCE) ## yes or no. it is a binomial distribution

hist(isle$AREA)

# binomial (logistic) GLMs: likelihood-ratio test of interaction vs additive model
anova(glm(INCIDENCE~AREA*ISOLATION,data=isle,binomial), glm(INCIDENCE~AREA+ISOLATION,data=isle,binomial), test="Chi")
## Analysis of Deviance Table
## 
## Model 1: INCIDENCE ~ AREA * ISOLATION
## Model 2: INCIDENCE ~ AREA + ISOLATION
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        46     28.252                     
## 2        47     28.402 -1 -0.15043   0.6981
## interaction is not significant (p = 0.698), so keep the additive model

# final additive logistic model
mod_isol<-glm(INCIDENCE~AREA+ISOLATION,data=isle,binomial)
tidy(mod_isol)
##          term   estimate std.error statistic     p.value
## 1 (Intercept)  6.6416611 2.9217506  2.273179 0.023015414
## 2        AREA  0.5807241 0.2477832  2.343679 0.019094615
## 3   ISOLATION -1.3719411 0.4768563 -2.877053 0.004014078
# per-observation fitted values, SEs, residuals, leverage etc.
augment(mod_isol)
##    INCIDENCE  AREA ISOLATION    .fitted   .se.fit      .resid        .hat
## 1          1 7.928     3.317  6.6949132 2.0227018  0.04972775 0.005052032
## 2          0 1.925     7.554 -2.6040881 0.9183918 -0.37778986 0.054102919
## 3          1 2.045     5.883 -0.2418876 0.5500852  1.28170739 0.074552669
## 4          0 4.781     5.932  1.2797485 0.6453764 -1.74649863 0.070912735
## 5          0 1.536     5.308  0.2513900 0.6953805 -1.28586241 0.118999304
## 6          1 7.369     4.934  4.1518597 1.3774172  0.17670661 0.028944873
## 7          1 8.599     2.882  7.6813735 2.3058717  0.03037217 0.002452271
## 8          0 2.422     8.771 -3.9851205 1.3800947 -0.19193498 0.034140261
## 9          1 6.403     6.092  2.0021724 0.9407992  0.50332754 0.092784859
## 10         1 7.199     6.977  1.2502609 1.1089470  0.70974782 0.212850070
## 11         0 2.654     7.748 -2.4468968 0.9247954 -0.40747612 0.062718734
## 12         1 4.128     4.297  3.1436593 1.0736025  0.29058813 0.045693463
## 13         0 4.175     8.516 -2.6172662 1.2021877 -0.37539441 0.091662047
## 14         1 7.098     3.318  6.2115403 1.8801232  0.06331091 0.007066774
## 15         0 2.392     9.292 -4.7173235 1.6160077 -0.13341180 0.022942858
## 16         1 8.690     5.923  3.5621465 1.4770007  0.23656988 0.058549997
## 17         1 4.667     4.997  2.4963108 0.8676017  0.39791975 0.052944010
## 18         1 2.307     2.434  4.6420870 1.8076900  0.13850212 0.030909148
## 19         1 1.053     6.842 -2.1336574 0.8187356  2.11922608 0.063462120
## 20         0 0.657     6.452 -1.8285671 0.8133978 -0.54584735 0.078906745
## 21         1 5.632     2.506  6.4742149 2.0131568  0.05552515 0.006236680
## 22         1 5.895     6.613  0.9923832 0.8226123  0.79412324 0.133514883
## 23         0 4.855     9.577 -3.6780033 1.6718829 -0.22342470 0.067230700
## 24         1 8.241     6.330  2.7430213 1.3347406  0.35323729 0.101247094
## 25         1 2.898     6.649 -0.7974368 0.5390587  1.52926965 0.062221216
## 26         1 4.445     5.032  2.3193722 0.8259026  0.43311817 0.055611762
## 27         1 6.027     2.023  7.3662485 2.2736828  0.03555379 0.003266466
## 28         0 1.574     5.737 -0.3151052 0.6214237 -1.04685717 0.094184587
## 29         0 2.700     5.762  0.3044916 0.5138626 -1.30915077 0.064507237
## 30         0 3.106     6.924 -1.0539300 0.6014247 -0.77335789 0.069330829
## 31         0 4.330     7.262 -0.8068398 0.7271068 -0.85905199 0.112801064
## 32         1 7.799     3.219  6.7544500 2.0370226  0.04827011 0.004828403
## 33         0 4.630     9.229 -3.3312307 1.5156249 -0.26504685 0.076575773
## 34         1 6.951     5.841  2.6647664 1.0909086  0.36687694 0.072425761
## 35         0 0.859     9.294 -5.6103175 1.7682844 -0.08547779 0.011366454
## 36         1 3.657     2.759  4.9801837 1.7151361  0.11704093 0.019952029
## 37         0 2.696     8.342 -3.2374394 1.1718580 -0.27753583 0.049937360
## 38         0 4.171     8.805 -3.0160800 1.3269769 -0.30929130 0.078423516
## 39         1 8.063     2.274  8.2042455 2.4540739  0.02338600 0.001647344
## 40         0 0.153     5.650 -1.0209553 0.8736936 -0.78443478 0.148625709
## 41         1 1.948     5.447  0.2999485 0.6224349  1.05297404 0.094710725
## 42         1 2.006     5.480  0.2883564 0.6102501  1.05766425 0.091193068
## 43         1 6.508     4.007  4.9236456 1.5175252  0.12038482 0.016513561
## 44         1 1.501     5.400  0.1048461 0.6803167  1.13322338 0.115390554
## 45         1 9.269     5.272  4.7915194 1.7185721  0.12857358 0.024119480
## 46         1 4.170     4.786  2.4971705 0.8829273  0.39775534 0.054791561
## 47         0 3.376     5.219  1.4420251 0.6608123 -1.81893826 0.067542292
## 48         0 2.228     7.990 -3.0262950 1.0598974 -0.30775245 0.049568579
## 49         0 1.801     8.466 -3.9273081 1.3033732 -0.19751027 0.032191600
## 50         1 6.441     3.451  5.6475363 1.7226654  0.08390460 0.010395824
##       .sigma      .cooksd  .std.resid
## 1  0.7857380 2.104652e-06  0.04985384
## 2  0.7836824 1.490976e-03 -0.38844394
## 3  0.7608212 3.695634e-02  1.33233376
## 4  0.7389636 9.846404e-02 -1.81192407
## 5  0.7593674 6.571249e-02 -1.36995523
## 6  0.7853275 1.610026e-04  0.17932088
## 7  0.7857596 3.789673e-07  0.03040948
## 8  0.7852446 2.267784e-04 -0.19529769
## 9  0.7819000 5.074612e-03  0.52843983
## 10 0.7768695 3.279867e-02  0.79997301
## 11 0.7833181 2.059977e-03 -0.42088863
## 12 0.7845475 7.212455e-04  0.29746366
## 13 0.7836234 2.703393e-03 -0.39388013
## 14 0.7857166 4.793163e-06  0.06353580
## 15 0.7855204 7.161084e-05 -0.13496908
## 16 0.7849497 6.248713e-04  0.24381523
## 17 0.7834562 1.621102e-03  0.40889111
## 18 0.7854985 1.057314e-04  0.14069354
## 19 0.7163724 2.036940e-01  2.18985125
## 20 0.7812850 4.980226e-03 -0.56874738
## 21 0.7857295 3.247510e-06  0.05569911
## 22 0.7756394 2.197352e-02  0.85311456
## 23 0.7850318 6.509706e-04 -0.23133644
## 24 0.7838496 2.689687e-03  0.37260304
## 25 0.7504828 5.235267e-02  1.57918812
## 26 0.7830198 2.043872e-03  0.44568818
## 27 0.7857549 6.929113e-07  0.03561200
## 28 0.7688543 2.792098e-02 -1.09993641
## 29 0.7600072 3.331542e-02 -1.35353487
## 30 0.7768320 9.300333e-03 -0.80164641
## 31 0.7741807 2.131787e-02 -0.91202962
## 32 0.7857400 1.894375e-06  0.04838707
## 33 0.7847194 1.070120e-03 -0.27581762
## 34 0.7837626 1.953355e-03  0.38093079
## 35 0.7856702 1.418741e-05 -0.08596776
## 36 0.7855790 4.758881e-05  0.11822630
## 37 0.7846501 7.240988e-04 -0.28473636
## 38 0.7843352 1.507979e-03 -0.32218255
## 39 0.7857648 1.506733e-07  0.02340529
## 40 0.7757101 2.462274e-02 -0.85015199
## 41 0.7686438 2.853890e-02  1.10668488
## 42 0.7685568 2.758454e-02  1.10946092
## 43 0.7855685 4.138763e-05  0.12139129
## 44 0.7654276 4.426004e-02  1.20486874
## 45 0.7855381 7.006852e-05  0.13015277
## 46 0.7834536 1.682790e-03  0.40912142
## 47 0.7350536 1.095115e-01 -1.88366378
## 48 0.7843927 8.870344e-04 -0.31567568
## 49 0.7852146 2.256495e-04 -0.20076823
## 50 0.7856740 1.247730e-05  0.08434416
# visreg: partial-effect plots for each predictor in turn
visreg(mod_isol)

visreg(mod_isol, xvar = "ISOLATION") ## default display is on the logit (link) scale

visreg(mod_isol, xvar = "AREA")

# scale = "response" back-transforms the effect to the probability (0-1) scale
visreg(mod_isol, xvar = "ISOLATION", scale = "response", rug=FALSE, ylab="Incidence",xlab="Isolation")

visreg(mod_isol, xvar = "AREA", scale = "response", rug=FALSE)

Non-linear Models

# --- Decay data: mass remaining over time (exponential decay example) ---
# NOTE(review): hard-coded machine-specific path; runs only on the author's machine
decay = data.table(read.csv("~/Documents/formations/R - Ecosse/courses/Mod_9_nonlinear_mods/decay.csv"))
print(summary(decay))
##       TIME         MASSLEFT      
##  Min.   : 0.0   Min.   :  8.196  
##  1st Qu.: 7.5   1st Qu.: 21.522  
##  Median :15.0   Median : 35.015  
##  Mean   :15.0   Mean   : 42.146  
##  3rd Qu.:22.5   3rd Qu.: 57.460  
##  Max.   :30.0   Max.   :125.000
print(str(decay))
## Classes 'data.table' and 'data.frame':   31 obs. of  2 variables:
##  $ TIME    : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ MASSLEFT: num  125 100.2 70 83.5 100 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL
ggpairs(decay)

hist(decay$TIME)

hist(decay$MASSLEFT)

# NOTE(review): a boxplot with a continuous x draws a single box -- kept for
# the record, but the scatterplot below is the informative display
ggplot(data=decay, aes(x=TIME, y=MASSLEFT)) + geom_boxplot() + theme_bw()

ggplot(data=decay, aes(x=TIME, y=MASSLEFT)) + geom_point() + theme_bw()

# log-transforming the response straightens the curve: consistent with
# exponential decay (linear on the log scale)
ggplot(data=decay, aes(x=TIME, y=log(MASSLEFT))) + geom_point() + theme_bw()

ggplot(data=decay, aes(x=TIME, y=log(MASSLEFT))) + geom_point() + theme_bw() + geom_smooth(method="lm")

# compare residual diagnostics of the raw-scale and log-scale linear models
par(mfrow=c(2,2))

lm.1 = lm(MASSLEFT~TIME, data=decay)
plot(lm.1)

## raw scale: poor diagnostics

lm.2 = lm(log(MASSLEFT)~TIME, data=decay)
plot(lm.2)

## log scale: much better behaved residuals

par(mfrow=c(1,1))

# coefficient table for the log-linear model: the TIME slope is the decay rate per unit time
summary(lm.2)
## 
## Call:
## lm(formula = log(MASSLEFT) ~ TIME, data = decay)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5935 -0.2043  0.0067  0.2198  0.6297 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.547386   0.100295   45.34  < 2e-16 ***
## TIME        -0.068528   0.005743  -11.93 1.04e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.286 on 29 degrees of freedom
## Multiple R-squared:  0.8308, Adjusted R-squared:  0.825 
## F-statistic: 142.4 on 1 and 29 DF,  p-value: 1.038e-12
visreg(lm.2)

# predict over a fine TIME grid; int="c" adds a confidence interval (fit, lwr, upr)
to_predict = seq(0, 30, .1)
pred_MASS = predict(lm.2, list(TIME=to_predict), int="c")

# predictions are on the log scale, so exponentiate to return to mass units
df_pred = cbind(data.frame(TIME=to_predict), exp(pred_MASS))


## now plot; do not forget to exponentiate the predictions
plot(MASSLEFT~TIME, data=decay)
matlines(to_predict,exp(pred_MASS),lty=c(1,2,2),col="black")

## OR the ggplot equivalent, with a shaded confidence ribbon
ggplot(data=decay) + geom_point(aes(y=MASSLEFT, x=TIME)) + theme_bw() + geom_line(data=df_pred, aes(x=TIME, y=fit), color="blue") + geom_ribbon(data=df_pred, aes(x=TIME, ymin=lwr, ymax=upr), alpha=.2)

# --- Cairngorm light-transmission data ---
# NOTE(review): hard-coded machine-specific path
cairn = data.table(read.csv("~/Documents/formations/R - Ecosse/courses/Mod_9_nonlinear_mods/CairngormNL.csv"))
print(summary(cairn))
##      REPID           GRP      ELEVATION         PLANT.HT    
##  Min.   : 1.00   FMJOKP:9   Min.   : 650.0   Min.   : 3.00  
##  1st Qu.:14.25   GCREGW:9   1st Qu.: 741.6   1st Qu.: 8.00  
##  Median :27.50   JTCMSY:9   Median : 832.9   Median :11.65  
##  Mean   :27.50   KAJJDM:9   Mean   : 851.0   Mean   :16.11  
##  3rd Qu.:40.75   LCAPKH:9   3rd Qu.: 946.1   3rd Qu.:23.38  
##  Max.   :54.00   MWCPJC:9   Max.   :1113.0   Max.   :43.00  
##   TRANSMISSION    
##  Min.   :0.05446  
##  1st Qu.:0.33716  
##  Median :0.84144  
##  Mean   :0.68820  
##  3rd Qu.:1.00000  
##  Max.   :1.17284
# structure listing (str() returns NULL invisibly, hence the trailing NULL below)
print(str(cairn))
## Classes 'data.table' and 'data.frame':   54 obs. of  5 variables:
##  $ REPID       : int  19 37 10 46 1 28 2 20 29 11 ...
##  $ GRP         : Factor w/ 6 levels "FMJOKP","GCREGW",..: 1 3 5 6 2 4 2 1 4 5 ...
##  $ ELEVATION   : num  650 651 651 655 657 ...
##  $ PLANT.HT    : num  36 27 26 36 32 26 43 22 18 28.5 ...
##  $ TRANSMISSION: num  0.333 0.278 0.169 0.158 0.254 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL
# pairwise plots for all cairn variables (binwidth messages echoed below)
ggpairs(cairn)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# histograms of each variable (REPID is an ID -- its histogram is only a sanity check)
par(mfrow=c(2,2))
hist(cairn$REPID)
hist(cairn$ELEVATION)
hist(cairn$PLANT.HT)
hist(cairn$TRANSMISSION)

par(mfrow=c(1,1))

# NOTE(review): grouping a boxplot by continuous PLANT.HT triggers the
# position_dodge warnings below; the colored scatterplot is the better display
ggplot(data=cairn, aes(x=ELEVATION, y=TRANSMISSION, group=PLANT.HT)) + geom_boxplot() + theme_bw()
## Warning: position_dodge requires constant width: output may be incorrect
## Warning: position_dodge requires non-overlapping x intervals

ggplot(data=cairn, aes(x=ELEVATION, y=TRANSMISSION, color=PLANT.HT)) + geom_point() + theme_bw()

# full model with ELEVATION x PLANT.HT interaction; diagnostics + residual histogram
par(mfrow=c(2,3))
lm.1 = lm(TRANSMISSION~ELEVATION*PLANT.HT, data=cairn)
plot(lm.1)
hist(lm.1$residuals)
summary(lm.1)
## 
## Call:
## lm(formula = TRANSMISSION ~ ELEVATION * PLANT.HT, data = cairn)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82590 -0.07902  0.02264  0.11380  0.47514 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         1.778e-01  4.474e-01   0.397   0.6928  
## ELEVATION           9.122e-04  5.086e-04   1.794   0.0789 .
## PLANT.HT           -2.099e-02  2.025e-02  -1.037   0.3049  
## ELEVATION:PLANT.HT  5.605e-06  2.512e-05   0.223   0.8243  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2077 on 50 degrees of freedom
## Multiple R-squared:  0.6618, Adjusted R-squared:  0.6415 
## F-statistic: 32.61 on 3 and 50 DF,  p-value: 7.996e-12
par(mfrow=c(1,2))

# partial-effect plots; visreg warns (below) because the model contains an interaction
visreg(lm.1)
## Please note that you are attempting to plot a 'main effect' in a
## model that contains an interaction.  This is potentially
## misleading; you may wish to consider using the 'by' argument.
## 
## Conditions used in construction of plot
## PLANT.HT: 11.65
## Please note that you are attempting to plot a 'main effect' in a
## model that contains an interaction.  This is potentially
## misleading; you may wish to consider using the 'by' argument.
## 
## Conditions used in construction of plot
## ELEVATION: 832.85

## crappy summary

## the interaction is weak -- try removing it
# update(mod, ~. -term) refits the model without the named term
lm.1.simple = update(lm.1, ~. -ELEVATION:PLANT.HT)
visreg(lm.1.simple)

# F-test: does dropping the interaction worsen the fit?
anova(lm.1, lm.1.simple)
## Analysis of Variance Table
## 
## Model 1: TRANSMISSION ~ ELEVATION * PLANT.HT
## Model 2: TRANSMISSION ~ ELEVATION + PLANT.HT
##   Res.Df    RSS Df  Sum of Sq      F Pr(>F)
## 1     50 2.1564                            
## 2     51 2.1586 -1 -0.0021472 0.0498 0.8243
### fits are equivalent (p = 0.82) AND the additive model is simpler: keep it


# allow curvature in the elevation effect via an orthogonal quadratic polynomial
mod.quad = lm(TRANSMISSION~PLANT.HT + poly(ELEVATION,2),data=cairn)
par(mfrow=c(2,3))
plot(mod.quad)
hist(mod.quad$residuals)
summary(mod.quad)
## 
## Call:
## lm(formula = TRANSMISSION ~ PLANT.HT + poly(ELEVATION, 2), data = cairn)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81938 -0.11044  0.01996  0.12300  0.40343 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.848282   0.072584  11.687 6.57e-16 ***
## PLANT.HT            -0.009936   0.004184  -2.375   0.0214 *  
## poly(ELEVATION, 2)1  1.321541   0.264673   4.993 7.61e-06 ***
## poly(ELEVATION, 2)2 -0.624061   0.276481  -2.257   0.0284 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1979 on 50 degrees of freedom
## Multiple R-squared:  0.6927, Adjusted R-squared:  0.6743 
## F-statistic: 37.58 on 3 and 50 DF,  p-value: 7.406e-13
# F-test: quadratic ELEVATION term vs the purely linear additive model
anova(lm.1.simple, mod.quad)
## Analysis of Variance Table
## 
## Model 1: TRANSMISSION ~ ELEVATION + PLANT.HT
## Model 2: TRANSMISSION ~ PLANT.HT + poly(ELEVATION, 2)
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)  
## 1     51 2.1586                             
## 2     50 1.9589  1   0.19961 5.0948 0.0284 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# added-variable plots (car) for each term in the quadratic model
avPlots(mod.quad)

par(mfrow=c(1,1))
# quadratic elevation effect at several plant heights, overlaid on one panel
visreg(mod.quad, xvar="ELEVATION", by="PLANT.HT", overlay=TRUE)

# cairn$plant_scaled = round(cairn$PLANT.HT/10) + 1
# mod.quad = lm(TRANSMISSION~plant_scaled + poly(ELEVATION,2),data=cairn)
# visreg(mod.quad, xvar="ELEVATION", by="plant_scaled", overlay=TRUE)


### predictions over a regular elevation x plant-height grid
to_predict_ELEV = seq(600, 1200, 10)
to_predict_PLANT = seq(0, 50, 1)

# expand.grid builds every combination of the two predictor grids
to_predict = expand.grid(ELEVATION=to_predict_ELEV, PLANT.HT=to_predict_PLANT)

predicted = predict(mod.quad, to_predict, int="c")
df_pred = data.table(cbind(to_predict, predicted))

# ggplot(data=cairn) + geom_point(aes(y=TRANSMISSION, x=ELEVATION, color=PLANT.HT)) + theme_bw() + geom_line(data=df_pred, aes(x=ELEVATION, y=fit, group=PLANT.HT), alpha=.15) + geom_line(data=df_pred[PLANT.HT == mean(df_pred$PLANT.HT),], aes(x=ELEVATION, y=fit, group=PLANT.HT), color="blue", size=1.1) + scale_color_gradient(low="green", high="purple")

# red line: model prediction at the median plant height (data.table row filter on df_pred)
ggplot(data=cairn, aes(y=TRANSMISSION, x=ELEVATION, color=PLANT.HT)) + geom_point() + theme_bw() + stat_smooth(method = "lm", formula = y ~ x + I(x^2),se=FALSE) + geom_line(data=df_pred[PLANT.HT == median(df_pred$PLANT.HT),], aes(x=ELEVATION, y=fit, group=PLANT.HT), color="red", size=1.1)

# --- Deer jaw-bone growth data (asymptotic growth example) ---
# NOTE(review): hard-coded machine-specific path
deers = data.table(read.csv("~/Documents/formations/R - Ecosse/courses/Mod_9_nonlinear_mods/jaws.csv"))
print(summary(deers))
##       AGE             BONE       
##  Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:11.35   1st Qu.: 85.13  
##  Median :24.72   Median :103.85  
##  Mean   :24.80   Mean   : 93.98  
##  3rd Qu.:37.05   3rd Qu.:115.75  
##  Max.   :50.60   Max.   :142.00
# structure listing (str() returns NULL invisibly, hence the trailing NULL below)
print(str(deers))
## Classes 'data.table' and 'data.frame':   54 obs. of  2 variables:
##  $ AGE : num  0 5.11 1.32 35.24 1.63 ...
##  $ BONE: num  0 20.2 11.1 140.7 26.2 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
## NULL
ggpairs(deers)

hist(deers$AGE)

hist(deers$BONE)

# NOTE(review): boxplot with continuous x draws a single box; the scatterplot is the useful view
ggplot(data=deers, aes(x=AGE, y=BONE)) + geom_boxplot() + theme_bw()

ggplot(data=deers, aes(x=AGE, y=BONE)) + geom_point() + theme_bw()

# note BONE has a 0 value (see summary above), so log(BONE) yields -Inf for that point
ggplot(data=deers, aes(x=AGE, y=log(BONE))) + geom_point() + theme_bw()

## the log transform decidedly does not linearize this relationship

par(mfrow=c(2,3))
lm.1 = lm(BONE~AGE, data=deers)
plot(lm.1)
hist(lm.1$residuals)
## straight-line fit: poor diagnostics

# quadratic in AGE (orthogonal polynomial) as a first attempt at the curvature
mod.quad = lm(BONE~poly(AGE,2),data=deers)
plot(mod.quad)

hist(mod.quad$residuals)
summary(mod.quad)
## 
## Call:
## lm(formula = BONE ~ poly(AGE, 2), data = deers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.175  -9.360   1.275   8.089  37.905 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     93.979      2.062  45.585  < 2e-16 ***
## poly(AGE, 2)1  182.082     15.150  12.019  < 2e-16 ***
## poly(AGE, 2)2 -118.949     15.150  -7.852 2.48e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.15 on 51 degrees of freedom
## Multiple R-squared:  0.8016, Adjusted R-squared:  0.7939 
## F-statistic: 103.1 on 2 and 51 DF,  p-value: < 2.2e-16
## a bit better, residuals are more homogeneous but still skewed.

## compare the linear and quadratic models
anova(lm.1, mod.quad)
## Analysis of Variance Table
## 
## Model 1: BONE ~ AGE
## Model 2: BONE ~ poly(AGE, 2)
##   Res.Df   RSS Df Sum of Sq      F   Pr(>F)    
## 1     52 25854                                 
## 2     51 11705  1     14149 61.648 2.48e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(lm.1)
## [1] 492.4925
AIC(mod.quad)
## [1] 451.7008
## the quadratic model wins on both AIC and residual sum of squares


# predict bone length over a fine AGE grid with confidence intervals
to_predict = seq(0, 55, .5)
predicted = predict(mod.quad, list(AGE=to_predict), int="c")
df_pred = cbind(data.frame(AGE=to_predict), predicted)

ggplot(data=deers) + geom_point(aes(y=BONE, x=AGE)) + theme_bw() + geom_line(data=df_pred, aes(x=AGE, y=fit), color="blue") + geom_ribbon(data=df_pred, aes(x=AGE, ymin=lwr, ymax=upr), alpha=.2)
## problem: the quadratic fit turns DOWN at the oldest ages -- biologically implausible


### Nonlinear least squares (NLS) regression
# Growth is better described by the asymptotic exponential function:
#   y = a - b * exp(-c * x)
# where a is the asymptote, b sets the drop at x = 0, and c the approach rate.

## nls() needs a starting value for each parameter -- estimate them from the scatterplot:

# a is the asymptotic value of the function, roughly 120 from the scatterplot

# b comes from setting AGE to zero: the model reduces to y = a - b, so
# b = a - intercept.  With an intercept around 10, b is around 110.

# c comes from the steepest part of the curve (in these data, AGE ~ 5, BONE ~ 40).
# Rearranging the equation with a and b fixed: c = -log((a - y)/b)/x, so
# c = -log((120 - 40)/110)/5 = 0.063.  A bit painful, but we now have three
# starting values.  Use them to build the NLS model.

starting_points = list(a=120,b=110,c=0.063)
MOD.ASYMP<-nls(BONE~a-b*exp(-c*AGE), data=deers, start=starting_points)

## alternatively use the self-starting form (SSasymp derives its own starting
## values), but sensitivity to initial parameters should still be checked
MOD.ASYMP.SS<-nls(BONE~SSasymp(AGE,a,b,c),data=deers)

library(nlstools)
## 
## 'nlstools' has been loaded.
## 
## IMPORTANT NOTICE: Most nonlinear regression models and data set examples
## related to predictive microbiolgy have been moved to the package 'nlsMicrobio'
# residual diagnostics for both nls fits (nlsResiduals comes from nlstools)
plot(MOD.ASYMP)
plot(nlsResiduals(MOD.ASYMP))

plot(MOD.ASYMP.SS)

plot(nlsResiduals(MOD.ASYMP.SS))

# hand-started model: estimates below give a ~ 115, b ~ 119, c ~ 0.12
summary(MOD.ASYMP)
## 
## Formula: BONE ~ a - b * exp(-c * AGE)
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a 115.2528     2.9139   39.55  < 2e-16 ***
## b 118.6875     7.8925   15.04  < 2e-16 ***
## c   0.1235     0.0171    7.22 2.44e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.21 on 51 degrees of freedom
## 
## Number of iterations to convergence: 5 
## Achieved convergence tolerance: 2.625e-06
# NOTE(review): SSasymp uses a different parameterization, so its b and c are
# not directly comparable with MOD.ASYMP even though the fitted curve matches
summary(MOD.ASYMP.SS)
## 
## Formula: BONE ~ SSasymp(AGE, a, b, c)
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a 115.2527     2.9139  39.553   <2e-16 ***
## b  -3.4348     8.1961  -0.419    0.677    
## c  -2.0915     0.1385 -15.101   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.21 on 51 degrees of freedom
## 
## Number of iterations to convergence: 0 
## Achieved convergence tolerance: 2.442e-07
# identical residual sums of squares: the two fits describe the same curve
anova(MOD.ASYMP, MOD.ASYMP.SS)
## Analysis of Variance Table
## 
## Model 1: BONE ~ a - b * exp(-c * AGE)
## Model 2: BONE ~ SSasymp(AGE, a, b, c)
##   Res.Df Res.Sum Sq Df Sum Sq F value Pr(>F)
## 1     51     8897.3                         
## 2     51     8897.3  0      0
## same RSS, but the coefficients differ (different parameterizations)

### try a simpler two-parameter form that forces the curve through the origin
MOD.ASYMP.2<-nls(BONE~a*(1-exp(-c*AGE)), data=deers,start=list(a=120,c=0.064))
plot(nlsResiduals(MOD.ASYMP.2))

anova(MOD.ASYMP, MOD.ASYMP.2)
## Analysis of Variance Table
## 
## Model 1: BONE ~ a - b * exp(-c * AGE)
## Model 2: BONE ~ a * (1 - exp(-c * AGE))
##   Res.Df Res.Sum Sq Df  Sum Sq F value Pr(>F)
## 1     51     8897.3                          
## 2     52     8929.1 -1 -31.843  0.1825  0.671
## equivalent fit (p = 0.67) with one fewer parameter: keep the simpler model

## final plot of the chosen model
# predict() for nls objects does not return confidence intervals directly
predicted = predict(MOD.ASYMP.2, list(AGE=to_predict)) # we cannot get confint
df_pred = cbind(data.frame(AGE=to_predict), predicted)

ggplot(data=deers) + geom_point(aes(y=BONE, x=AGE)) + theme_bw() + geom_line(data=df_pred, aes(x=AGE, y=predicted), color="blue")